# Read a single raw snapshot. flatten = TRUE asks jsonlite to flatten nested
# data frames into one non-nested data frame.
example <- jsonlite::fromJSON(
  here(
    "data", "BCC", "traffic", "raw", "temp_02",
    "traffic-data-at-int-201809060626.json"
  ),
  flatten = TRUE
)

# Per-variable summary statistics, rendered as a table.
skim(example) %>%
  skimr::kable()

Skim summary statistics
n obs: 2480
n variables: 20
| variable | missing | complete | n | min | max | empty | n_unique |
|---|---|---|---|---|---|---|---|
| lane | 0 | 2480 | 2480 | 4 | 6 | 0 | 705 |
| married | 0 | 2480 | 2480 | 1 | 1 | 0 | 2 |
| recorded | 0 | 2480 | 2480 | 19 | 19 | 0 | 2 |
| variable | missing | complete | n | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|---|
| ct | 0 | 2480 | 2480 | 80.44 | 28.29 | 20 | 60 | 80 | 96.5 | 150 | |
| dbid | 0 | 2480 | 2480 | 1.3e+09 | 154.54 | 1.3e+09 | 1.3e+09 | 1.3e+09 | 1.3e+09 | 1.3e+09 | |
| ds1 | 21 | 2459 | 2480 | 18.85 | 24.39 | 0 | 0 | 9 | 31 | 146 | |
| ds2 | 1317 | 1163 | 2480 | 27.07 | 26.04 | 0 | 0 | 23 | 41 | 167 | |
| ds3 | 2298 | 182 | 2480 | 32.43 | 29.05 | 0 | 10.25 | 25 | 46.75 | 114 | |
| ds4 | 2473 | 7 | 2480 | 21.29 | 36 | 0 | 0 | 0 | 31 | 87 | |
| link_plan | 0 | 2480 | 2480 | 1.35 | 0.78 | 0 | 1 | 1 | 2 | 3 | |
| mf1 | 21 | 2459 | 2480 | 3.76 | 6.09 | 0 | 0 | 2 | 5 | 52 | |
| mf2 | 1317 | 1163 | 2480 | 6.98 | 8.6 | 0 | 0 | 4 | 10 | 58 | |
| mf3 | 2298 | 182 | 2480 | 11.15 | 12.06 | 0 | 3 | 8 | 15 | 60 | |
| mf4 | 2473 | 7 | 2480 | 7.14 | 16.32 | 0 | 0 | 0 | 3 | 44 | |
| rf1 | 21 | 2459 | 2480 | 3.49 | 6.04 | 0 | 0 | 1 | 4 | 53 | |
| rf2 | 1317 | 1163 | 2480 | 6.59 | 8.53 | 0 | 0 | 4 | 9 | 55 | |
| rf3 | 2298 | 182 | 2480 | 10.79 | 12.01 | 0 | 2 | 7 | 14 | 59 | |
| rf4 | 2473 | 7 | 2480 | 6.71 | 15.23 | 0 | 0 | 0 | 3 | 41 | |
| ss | 0 | 2480 | 2480 | 2482.38 | 271.89 | 2065 | 2228 | 2443.5 | 2775 | 3047 | |
| tsc | 0 | 2480 | 2480 | 1295.85 | 2172.02 | 9 | 484 | 650.5 | 865 | 9011 | |
# Plot missingness per variable (as a percentage) for the single snapshot.
gg_miss_var(example, show_pct = TRUE)

# Read every JSON file in the folder and row-bind the results. `.id` stores
# the name of each input element in `source` (presumably the file path, since
# fs::dir_ls() returns a vector named by path -- confirm).
example <- here("data", "BCC", "traffic", "raw", "temp_02") %>%
  fs::dir_ls(regexp = "\\.json$") %>%
  map_df(jsonlite::fromJSON, flatten = TRUE, .id = "source")

# Cache the combined data (still including `source`) before further cleaning.
saveRDS(example, here("data", "BCC", "traffic", "clean", "example.Rds"))

# Drop the file identifier and parse `recorded` into date-times.
example %<>%
  select(-source) %>%
  mutate(recorded = anytime(recorded))

# Check the time span covered by the combined data.
summary(example$recorded)
## Min. 1st Qu. Median
## "2018-09-06 00:00:00" "2018-09-06 11:02:00" "2018-09-06 23:42:00"
## Mean 3rd Qu. Max.
## "2018-09-06 23:15:07" "2018-09-07 10:46:00" "2018-09-08 00:00:00"
TODO: identify the gaps
TODO: find largest period without data
TODO: fill in the gaps
There are 16113 duplicated rows in the raw data - the duplicate occurrences were removed (one copy of each row is kept):
# isUnique(example$dbid)
# Drop exact duplicate rows, keeping one copy of each.
example %<>%
  distinct()

Example from one intersection in St Lucia (tsc == 8074).
Measured flow (mf) aggregated for all lanes.
# Totals for intersection 8074: first sum the ds*/mf*/rf* columns within each
# row (rowSums with na.rm = TRUE, so missing values contribute 0), then sum
# those row totals per timestamp.
traffic_8074 <- example %>%
  filter(tsc == 8074) %>%
  mutate(ds = rowSums(select(., contains("ds")), na.rm = TRUE)) %>%
  mutate(mf = rowSums(select(., contains("mf")), na.rm = TRUE)) %>%
  mutate(rf = rowSums(select(., contains("rf")), na.rm = TRUE)) %>%
  group_by(recorded, tsc) %>%
  summarise(
    ds = sum(ds),
    mf = sum(mf),
    rf = sum(rf)
  ) %>%
  # summarise() only peels off the last grouping level; drop the remaining
  # grouping by `recorded` so later verbs operate on the whole table.
  ungroup()

All (ie. having lat long) across 2 days:
# Totals per timestamp and intersection: sum the ds*/mf*/rf* columns within
# each row (missing values contribute 0), then sum per (recorded, tsc).
traffic_agg <- example %>%
  mutate(ds = rowSums(select(., contains("ds")), na.rm = TRUE)) %>%
  mutate(mf = rowSums(select(., contains("mf")), na.rm = TRUE)) %>%
  mutate(rf = rowSums(select(., contains("rf")), na.rm = TRUE)) %>%
  group_by(recorded, tsc) %>%
  summarise(
    ds = sum(ds),
    mf = sum(mf),
    rf = sum(rf)
  ) %>%
  ungroup()

# Attach coordinates and keep only intersections that have both latitude and
# longitude. `intersect` is a data frame defined elsewhere (it shares its name
# with base::intersect) -- presumably the intersection metadata; confirm.
# The join key is spelled out: `tsc` is the only column the two tables share.
traffic_agg <- left_join(
  traffic_agg,
  select(
    intersect,
    tsc, coordinates.latLng.latitude, coordinates.latLng.longitude
  ),
  by = "tsc"
) %>%
  filter(
    !is.na(coordinates.latLng.longitude) & !is.na(coordinates.latLng.latitude)
  )

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
Traffic aggregates across city at 8 AM:
# Map of intersections at a single timestamp; point size encodes measured
# flow (mf).
traffic_agg %>%
  filter(recorded == "2018-09-06 08:00:00") %>%
  ggplot() +
  geom_point(
    aes(
      x = coordinates.latLng.longitude,
      y = coordinates.latLng.latitude,
      size = mf
    ),
    # stroke is a numeric width; 0 is the value the former FALSE coerced to.
    colour = "darkgreen", alpha = 0.5, shape = 20, stroke = 0
  ) +
  scale_size_continuous(name = "Measured flow", range = c(1, 12)) +
  theme_void() +
  coord_map()

Traffic aggregates across city for one day:
library(gganimate)

# Same map as the static plot, animated over `recorded`; the title shows the
# time of the current frame.
traffic_agg %>%
  ggplot() +
  geom_point(
    aes(
      x = coordinates.latLng.longitude,
      y = coordinates.latLng.latitude,
      size = mf
    ),
    # stroke is a numeric width; 0 is the value the former FALSE coerced to.
    colour = "darkgreen", alpha = 0.5, shape = 20, stroke = 0
  ) +
  scale_size_continuous(name = "Measured flow", range = c(1, 12)) +
  theme_void() +
  coord_map() +
  labs(title = "Time: {frame_time}") +
  transition_time(recorded)

There are 28569 files collected so far - better processing or HPC is needed for efficient load?
In the dataset collected so far we can find some small files:
##
## Variable ¦ Obs Missing Mean StdDev Min Max
## ---------+-------------------------------------------------------
## size ¦ 28569 0 672963 168293 50604 1.2e+07
Using some arbitrary threshold we can get rid of the small stuff?
# Count files below (TRUE) vs. at or above (FALSE) the 17000-byte threshold.
table(file.info(files)$size < 17000)
##
## FALSE
## 28569
**TODO: They can be either manually deleted, selected in R, or processed in bash?**
Bash code?
# List JSON files under the current directory that fall below the size cut-off.
# NOTE(review): GNU find rounds sizes up to whole units, so `-size -17k`
# matches files of at most 16 KiB, which is not the same cut-off as the
# `< 17000` bytes check done in R; `-size -17000c` would compare bytes.
sudo find . -name "*.json" -size -17k
# Same match, but deletes the files; run the listing above first to review.
sudo find . -name "*.json" -size -17k -delete
# Record the R version and packages used to produce this report.
sessionInfo()
## R version 3.5.1 (2018-07-02)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 15063)
##
## Matrix products: default
##
## locale:
## [1] LC_COLLATE=English_Australia.1252 LC_CTYPE=English_Australia.1252
## [3] LC_MONETARY=English_Australia.1252 LC_NUMERIC=C
## [5] LC_TIME=English_Australia.1252
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] bindrcpp_0.2.2 tmaptools_2.0-1 tmap_2.1-1 sf_0.6-3
## [5] kableExtra_0.9.0 janitor_1.1.1 naniar_0.4.0.0 statar_0.6.5
## [9] skimr_1.0.3 fs_1.2.6 jsonlite_1.5 anytime_0.3.1
## [13] magrittr_1.5 readxl_1.1.0 here_0.1 forcats_0.3.0
## [17] stringr_1.3.1 dplyr_0.7.6 purrr_0.2.5 readr_1.1.1
## [21] tidyr_0.8.1 tibble_1.4.2 ggplot2_3.0.0 tidyverse_1.2.1
## [25] pacman_0.4.6
##
## loaded via a namespace (and not attached):
## [1] nlme_3.1-137 matrixStats_0.54.0 satellite_1.0.1
## [4] lubridate_1.7.4 webshot_0.5.1 RColorBrewer_1.1-2
## [7] httr_1.3.1 rprojroot_1.3-2 mapview_2.6.0
## [10] tools_3.5.1 backports_1.1.2 rgdal_1.3-4
## [13] R6_2.3.0 KernSmooth_2.23-15 mgcv_1.8-24
## [16] rgeos_0.3-28 spData_0.2.9.4 DBI_1.0.0
## [19] lazyeval_0.2.1 colorspace_1.3-2 raster_2.6-7
## [22] withr_2.1.2 sp_1.3-1 tidyselect_0.2.5
## [25] leaflet_2.0.2 compiler_3.5.1 cli_1.0.1
## [28] rvest_0.3.2 xml2_1.2.0 labeling_0.3
## [31] scales_1.0.0 classInt_0.2-3 RApiDatetime_0.0.3
## [34] digest_0.6.18 rmarkdown_1.10 base64enc_0.1-3
## [37] dichromat_2.0-0 pkgconfig_2.0.2 htmltools_0.3.6
## [40] maps_3.3.0 highr_0.7 htmlwidgets_1.3
## [43] rlang_0.2.2 rstudioapi_0.8 shiny_1.1.0
## [46] bindr_0.1.1 crosstalk_1.0.0 Matrix_1.2-14
## [49] Rcpp_0.12.19 munsell_0.5.0 visdat_0.5.1
## [52] stringi_1.2.4 yaml_2.2.0 plyr_1.8.4
## [55] grid_3.5.1 parallel_3.5.1 promises_1.0.1
## [58] crayon_1.3.4 lattice_0.20-35 haven_1.1.2
## [61] mapproj_1.2.6 hms_0.4.2 knitr_1.20
## [64] pillar_1.3.0 stats4_3.5.1 XML_3.98-1.16
## [67] glue_1.3.0 evaluate_0.12 data.table_1.11.8
## [70] modelr_0.1.2 png_0.1-7 httpuv_1.4.5
## [73] cellranger_1.1.0 gtable_0.2.0 assertthat_0.2.0
## [76] mime_0.6 lwgeom_0.1-4 xtable_1.8-3
## [79] broom_0.5.0 e1071_1.7-0 later_0.7.5
## [82] class_7.3-14 viridisLite_0.3.0 units_0.6-1